##R code for models and analysis reported on in Watling et al. (2012) Ecological Modelling 246:79--85## ##Code for random forest models## library(randomForest) ##Load R libraries for analysis## library(RSAGA) #To create master prediction map# data <-read.table("C:/BioClim/American_Crocodile/croc_acut_full.txt", header=T) ##Add and attach file of climate data from grid cells indicated by georeferenced presences and pseudo-absences## attach(data) test <-randomForest(as.factor(species1)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, ntree=500, importance=T) ##Run the random forest model## var1pred <-scan("C:/BioClim/American_Crocodile/bio2.asc", skip=6) ##Load climate grids for each variable included in the model## var2pred <-scan("C:/BioClim/American_Crocodile/bio5.asc", skip=6) var3pred <-scan("C:/BioClim/American_Crocodile/bio6.asc", skip=6) var4pred <-scan("C:/BioClim/American_Crocodile/bio8.asc", skip=6) var5pred <-scan("C:/BioClim/American_Crocodile/bio14.asc", skip=6) var6pred <-scan("C:/BioClim/American_Crocodile/bio15.asc", skip=6) var7pred <-scan("C:/BioClim/American_Crocodile/bio16.asc", skip=6) var8pred <-scan("C:/BioClim/American_Crocodile/bio18.asc", skip=6) var9pred <-scan("C:/BioClim/American_Crocodile/bio19.asc", skip=6) var1pred[var1pred == -9999] = NA ##Assign N/A values from the climate grids## var2pred[var2pred == -9999] = NA var3pred[var3pred == -9999] = NA var4pred[var4pred == -9999] = NA var5pred[var5pred == -9999] = NA var6pred[var6pred == -9999] = NA var7pred[var7pred == -9999] = NA var8pred[var8pred == -9999] = NA var9pred[var9pred == -9999] = NA predict.data <-data.frame(var1pred, var2pred, var3pred, var4pred, var5pred, var6pred, var7pred, var8pred, var9pred) ##Create a data frame with all the climate data## names(predict.data) <-c("BIO2", "BIO5", "BIO6", "BIO8", "BIO14", "BIO15", "BIO16", "BIO18", "BIO19") ##Assign names to climate variables in newly-created data frame, being sure to use the same names as used in the random forest model run above## pred.1 <-predict(test, predict.data, type="prob", se.fit=F) ##Extrapolate the model ('test') based on the climate conditions represented in the 'predict.data' data frame## pred.2 <- pred.1[,2] ##Extract the cell-by-cell probabilities from the extrapolation file 'pred.1'## pred.3 <-na.exclude(pred.2) ##Exclude the N/A values from the prediction## dummy.null <-c(var1pred) ##Create a dummy vector with as many observations as the full climate grids used for extrapolation## dummy.null[is.na(dummy.null)]<- -9999 ##Assign the N/A values in the dummy vector as -9999## new <-replace(dummy.null, dummy.null > -9999, pred.3) ##Replace the non-N/A values in the dummy vector with the non-N/A predictions from the extrapolation file pred.1## write.table(new, file="C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/temp.txt") ##Save the dummy vector as a data file in vector form## data <-read.table("C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/temp.txt", header=T) ##Read the vector data back into R## data.2 <-data[,1] ##Extract the probabilities from the vector file## hdr = list(ncols=447, nrows=449, xllcorner= -109.1871509552, yllcorner=-32.339998245239, cellsize=0.167, nodata_value = -9999) ##Create a header row of metadata for the prediction map, based on the data from the climate asciis 'var1pred', etc## data.mat <-matrix(data.2, nrow=449, ncol=447, byrow=T) ##Format the vector file as a matrix with row and column numbers as in the climate ascii files## write.ascii.grid(data.mat, "C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/bioclim_pred_map.asc", header= hdr, digits = 6, georef = "corner") ##Save the prediction map as an ascii file## ##The code below calculates AUC and Cohen's kappa for model validation based on 100 random partitions of the occurrence data## library(PresenceAbsence)##Load additional R libraries## library(e1071) random.data <-read.table("C:/BioClim/American_Crocodile/random_points.txt", header=T) ##Add file of climate data from grid cells representing randomly-selected pseudo-absences## data.full <-read.table("C:/BioClim/American_Crocodile/Bioclim_files_July_2012/american_crocodile_all.txt", header=T) ##Add and attach files of climate data from grid cells representing species presences## attach(data.full) x<-seq(1, 100, by= 1) ##create sequence of 100 items## for (j in x){ ##Set up loop to repeat procedure 100 times## random <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data## random.train <-random[1:7500,] ##Create a training subset with 75% of the pseudo-absence data## random.test <-random[7501:10000,] ##Create a test subset with 25% of the pseudo-absence data## subset <-data.full[sample((1:120),120),] ##Randomize the presence data## pres.train <-subset[1:90,] ##Create a training subset with 75% of the presence data## pres.test <-subset[91:120,] ##Create a test subset with 25% of the presence data## train.data <-rbind(pres.train, random.train) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## test.data <-rbind(pres.test, random.test) ##Append the test presence subset to a 25% test subset of the pseudo-absence data## rf.model <-randomForest(as.factor(species)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, ntree=500, importance=T, data=train.data) ##Run random forest model with training data## rf.predict <-predict(rf.model, test.data, type="prob") rf.predict.2 <-rf.predict[,2] ##Extract the cell by cell probabilities## ##Routine to calculate AUC## roc <-function (test.data, rf.predict.2){ if (length(test.data) != length(rf.predict.2)) stop("obs and preds must be equal lengths") n.x <- length(test.data[test.data == 0]) n.y <- length(test.data[test.data == 1]) xy <- c(rf.predict.2[test.data == 0], rf.predict.2[test.data == 1]) rnk <- rank(xy) roc <- ((n.x * n.y) + ((n.x * (n.x + 1))/2) - sum(rnk[1:n.x]))/(n.x * n.y) } AUC.output <-roc(test.data[,1], rf.predict.2) ##AUC value## mat.1<-data.frame(1:2530, test.data[,1], rf.predict.2) ##Create data frame with probabilistic predictions from the model## conmat.b <- cmx(mat.1, threshold=0.15) ##Create a confusion matrix based on assigned threshold, calculated using code below## kappa.1<-classAgreement(conmat.b) ##Calculate kappa## kappa.2<-kappa.1["kappa"] ##Extract kappa value## capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/kappa_test_july_2012.txt", append=T) ##Save kappa and AUC values to file## capture.output(AUC.output, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/AUC_july_2012.txt", append=T) if (j== 100) break } ##Code to calculate threshold for threshold-dependent metrics by calculating five replicate estimates of kappa for each 0.01 change in threshold between 0 and 1## x<-seq(.01, 0.99, by=0.01)##Create a vector of probabilities from 0.01 to 0.99)## ##Routine to calculate five replicate estimates of kappa for each 0.01 change in the threshold between 0.01 and 0.99## for (j in x){ filename <-"dataset.txt" for (i in 1:5) { subset <-data.full[sample((1:120),120),] ##Randomize the presence data## training <-subset[1:90,] ##Create a training subset with 75% of the presence data## test <-subset[91:120,] ##Create a test subset with 25% of the presence data## random.subset <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data## training.data <-rbind(training, random.subset[1:7500,]) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## test.data <-rbind(test, random.subset[7501:10000,]) ##Append the test presence subset to a 25% test subset of the pseudo-absence data## rf.model <-randomForest(as.factor(species)~temp_apr+temp_dec+temp_feb+temp_jan+temp_mar+temp_may+temp_nov+temp_oct+temp_sep, ntree=500, importance=T, data=training.data) ##Run random forest model with training data## rf.predict <-predict(rf.model, test.data, type="prob")##Create prediction with the test data## rf.predict.2 <-rf.predict[,2]##Extract the cell by cell probabilities## mat.1<-data.frame(1:2530, test.data[,1], rf.predict.2)##Create data frame with probabilistic predictions from the model## conmat.b <- cmx(mat.1, threshold= j) ##Create a confusion matrix based on assigned threshold## kappa.1<-classAgreement(conmat.b) ##Calculate kappa## kappa.2<-kappa.1["kappa"]##Extract kappa value## capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/kappa_threshold.txt", append=T)##Save kappa value to file## if (i ==5) break } if (j== 0.99) break } ##The code above can be used to create models and prediction maps for monthly and bioclimate predictor variables## ##Code to calculate spatial correlation between monthly and bioclimate prediction maps## bio<-scan("C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/bioclim_pred_map.asc", skip=6) ##Scan bioclimate prediction map## bio.col <-c(bio) ##Reorganize data as vector## bio.col[bio.col==-9999] = NA ##Assign N/A values## mon<-scan("C:/BioClim/Revision_Nov_2011/Monthly/American_Crocodile/bioclim_pred_map.asc", skip=6) ##Scan monthly prediction map## mon.col<-c(mon) ##Reorganize data as vector## mon.col[mon.col==-9999] = NA ##Assign N/A values## spatial.cor <-cor(bio.col, mon.col, use="na.or.complete") ##Calculate spatial correlation## spatial.cor ##Return spatial correlation## ##The code below is as above, but for GLM models## #To create master prediction map# data <-read.table("C:/BioClim/American_Crocodile/croc_acut_full.txt", header=T) ##Add and attach file of climate data from grid cells indicated by georeferenced presences and pseudo-absences## attach(data) test <-glm(as.factor(species1)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial") ##Run the GLM model## var1pred <-scan("C:/BioClim/American_Crocodile/bio2.asc", skip=6) ##Load climate grids for each variable included in the model## var2pred <-scan("C:/BioClim/American_Crocodile/bio5.asc", skip=6) var3pred <-scan("C:/BioClim/American_Crocodile/bio6.asc", skip=6) var4pred <-scan("C:/BioClim/American_Crocodile/bio8.asc", skip=6) var5pred <-scan("C:/BioClim/American_Crocodile/bio14.asc", skip=6) var6pred <-scan("C:/BioClim/American_Crocodile/bio15.asc", skip=6) var7pred <-scan("C:/BioClim/American_Crocodile/bio16.asc", skip=6) var8pred <-scan("C:/BioClim/American_Crocodile/bio18.asc", skip=6) var9pred <-scan("C:/BioClim/American_Crocodile/bio19.asc", skip=6) var1pred[var1pred == -9999] = NA ##Assign N/A values from the climate grids## var2pred[var2pred == -9999] = NA var3pred[var3pred == -9999] = NA var4pred[var4pred == -9999] = NA var5pred[var5pred == -9999] = NA var6pred[var6pred == -9999] = NA var7pred[var7pred == -9999] = NA var8pred[var8pred == -9999] = NA var9pred[var9pred == -9999] = NA predict.data <-data.frame(var1pred, var2pred, var3pred, var4pred, var5pred, var6pred, var7pred, var8pred, var9pred) ##Create a data frame with all the climate data## names(predict.data) <-c("BIO2", "BIO5", "BIO6", "BIO8", "BIO14", "BIO15", "BIO16", "BIO18", "BIO19") ##Assign names to climate variables in newly-created data frame, being sure to use the same names as used in the random forest model run above## pred.1 <-predict(test, predict.data, type="link", se.fit=F) ##Extrapolate the model ('test') based on the climate conditions represented in the 'predict.data' data frame## pred.2 <-exp(pred.1)/(1+exp(pred.1)) ##Rescale GLM predictions## pred.3 = na.exclude(pred.2) ##Exclude N/A data from predictions## dummy.null <-c(var1pred) ##Create a dummy vector with as many observations as the full climate grids used for extrapolation## dummy.null[is.na(dummy.null)]<- -9999 ##Assign the N/A values in the dummy vector as -9999## new <-replace(dummy.null, dummy.null > -9999, pred.3) ##Replace the non-N/A values in the dummy vector with the non-N/A predictions from the extrapolation file pred.1## write.table(new, file="C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/temp.txt") ##Save the dummy vector as a data file in vector form## data <-read.table("C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/temp.txt", header=T) ##Read the vector data back into R## data.2 <-data[,1] ##Extract the probabilities from the vector file## hdr = list(ncols=447, nrows=449, xllcorner= -109.1871509552, yllcorner=-32.339998245239, cellsize=0.167, nodata_value = -9999) ##Create a header row of metadata for the prediction map, based on the data from the climate asciis 'var1pred', etc## data.mat <-matrix(data.2, nrow=449, ncol=447, byrow=T) ##Format the vector file as a matrix with row and column numbers as in the climate ascii files## write.ascii.grid(data.mat, "C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/bioclim_pred_map.asc", header= hdr, digits = 6, georef = "corner") ##Save the prediction map as an ascii file## ##The code below calculates AUC and Cohen's kappa for model validation based on 100 random partitions of the occurrence data## library(PresenceAbsence)##Load additional R libraries## library(e1071) random.data <-read.table("C:/BioClim/American_Crocodile/random_points.txt", header=T) ##Add file of climate data from grid cells representing randomly-selected pseudo-absences## data.full <-read.table("C:/BioClim/American_Crocodile/Bioclim_files_July_2012/american_crocodile_all.txt", header=T) ##Add and attach files of climate data from grid cells representing species presences## x <-seq(1, 100, by=1) ##create sequence of 100 items## for (j in x){ ##Set up loop to repeat procedure 100 times## random <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data## random.train <-random[1:7500,] ##Create a training subset with 75% of the pseudo-absence data## random.test <-random[7501:10000,] ##Create a test subset with 25% of the pseudo-absence data## subset <-data.full[sample((1:120),120),] ##Randomize the presence data## pres.train <-subset[1:90,] ##Create a training subset with 75% of the presence data## pres.test <-subset[91:120,] ##Create a test subset with 25% of the presence data## train.data <-rbind(pres.train, random.train) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## test.data <-rbind(pres.test, random.test) ##Append the test presence subset to a 25% test subset of the pseudo-absence data## dataset.glm <-glm(species~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial", data=train.data) ##Run the GLM model## glm.predict <-predict(dataset.glm, test.data, type="response") ##Create prediction with the test data## ##Routine to calculate AUC## roc <-function (test.data, glm.predict){ if (length(test.data) != length(glm.predict)) stop("obs and preds must be equal lengths") n.x <- length(test.data[test.data == 0]) n.y <- length(test.data[test.data == 1]) xy <- c(glm.predict[test.data == 0], glm.predict[test.data == 1]) rnk <- rank(xy) roc <- ((n.x * n.y) + ((n.x * (n.x + 1))/2) - sum(rnk[1:n.x]))/(n.x * n.y) } AUC.output <-roc(test.data[,1], glm.predict) ##AUC value## mat.1 <-data.frame(1:2530, test.data[,1], glm.predict) ##Create data frame with probabilistic predictions from the model## conmat.b <- cmx(mat.1, threshold=0.12) ##Create a confusion matrix based on assigned threshold, calculated using code below## kappa.1<-classAgreement(conmat.b) ##Calculate kappa## kappa.2<-kappa.1["kappa"] ##Extract kappa value## capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/GLM/Bioclim/American_Crocodile/kappa_test_july_2012.txt", append=T) ##Save kappa and AUC values to file## capture.output(AUC.output, file="C:/BioClim/Revision_Nov_2011/GLM/Bioclim/American_Crocodile/AUC_july_2012.txt", append=T) if (j== 100) break } ##Code to calculate threshold for threshold-dependent metrics by calculating five replicate estimates of kappa for each 0.01 change in threshold between 0 and 1## x<-seq(.01, 0.99, by=0.01)##Create a vector of probabilities from 0.01 to 0.99)## ##Routine to calculate five replicate estimates of kappa for each 0.01 change in the threshold between 0.01 and 0.99## for (j in x){ filename <-"dataset.txt" for (i in 1:5) { subset <-data.full[sample((1:120),120),] ##Randomize the presence data## training <-subset[1:90,] ##Create a training subset with 75% of the presence data## test <-subset[91:120,] ##Create a test subset with 25% of the presence data## random.subset <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data## training.data <-rbind(training, random.subset[1:7500,]) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## test.data <-rbind(test, random.subset[7501:10000,]) ##Append the test presence subset to a 25% test subset of the pseudo-absence data## dataset.glm <-glm(species~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial", data=training.data) ##Run random forest model with training data## glm.predict <-predict(dataset.glm, test.data, type="response") ##Create prediction with the test data## mat.1 <-data.frame(1:9961, test.data[,1], glm.predict) ##Create data frame with probabilistic predictions from the model## conmat <- cmx(mat.1, threshold= j) ##Create a confusion matrix based on assigned threshold## kappa.1<-classAgreement(conmat) ##Calculate kappa## kappa.2<-kappa.1["kappa"] ##Extract kappa value## capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/GLM/Monthly/American_Crocodile/kappa_test_threshold.txt", paste(i,j, kappa.2),append=T) ##Save kappa value to file## if (i ==5) break } if (j== 0.99) break } ##Code for calculating spatial correlation between GLM prediction maps is same as for random forest maps above##